kSanity-VIRGO metagenomics counts QC

Author

Laura Vermeren & Symul

Published

April 17, 2025

Code
library(tidyverse)
library(magrittr)
library(gt)
library(patchwork)
library(SummarizedExperiment)
library(tidySummarizedExperiment)
library(labelled)
library(vegan)
library(RColorBrewer)

# tmp <- fs::dir_map("R scripts mg/", source)
tmp <- fs::dir_map("R scripts/", source)

theme_set(theme_light())

Load the data

Code
data_source <- "real"
data_dir <- get_data_dir(data_source)

if (data_source == TRUE) {
  mg_dir <- str_c(data_dir, "03 metagenomics combined/")
  counts <- read_csv(str_c(mg_dir, "mg_combined.csv"))
  manifest <- read_csv(str_c(mg_dir, "mg_combined_manifest.csv"))
} else {
  mg_dir <- str_c(data_dir, "02 Metagenomics/")
  counts <- read_csv(str_c(mg_dir, "MVIBR_kSanityVIRGO2_ReadCounts_20250404.csv"))
  counts_corr <- read_csv(str_c(mg_dir, "MVIBR_kSanityVIRGO2_taxaGLcor_20250404.csv"))
  relabs <- read_csv(str_c(mg_dir, "MVIBR_kSanityVIRGO2_RelAbund_20250404.csv"))
  technical_metadata <- read_csv(str_c(mg_dir, "VIBRANT_MG_technicalMetaData_20250404.csv"))
  LBP_strain_info <- readxl::read_xlsx(str_c(data_dir, "00 Trial Data/IsolateNumbers.xlsx"))
}

The names of the strains do not correspond exactly to those in the metagenomics files. We will modify the names of the strains in LBP_strain_info to match them.

Code
LBP_strain_info <- 
  LBP_strain_info |> 
  mutate(strain_id = `VMRC ID`, 
         LBP = ifelse(is.na(LC106), "LC-115", "LC-106 & LC-115") |> factor(), 
         strain_origin = `Geographic source` |> factor()
  ) |>
  dplyr::rename(Biose_ID = `Biose ID`) |> #VMRC_ID = `VMRC ID` 
  dplyr::select(strain_id, LBP, strain_origin, Biose_ID) |> 
  arrange(strain_origin, LBP) |> 
  mutate(strain_id = strain_id |> fct_inorder()) |> 
  dplyr::select(strain_id, LBP, strain_origin, contains("ID")) |> 
  mutate(strain_id = sub("_.*", "", strain_id),
         strain_id = case_when(strain_id == "CC0028A1" ~ "C0028A1", 
                               TRUE ~ strain_id
    ))

LBP_strain_info |>   
  set_variable_labels(
    strain_id = "Strain ID", 
    strain_origin = "Strain origin", 
    Biose_ID = "Biose ID", 
    #VMRC_ID = "VMRC ID"
  ) |> 
  gt(caption = "LBP strain information") |> 
  tab_style(style = cell_text(weight = "bold"),
            locations = cells_column_labels())
LBP strain information
Strain ID LBP Strain origin Biose ID
FF00018 LC-106 & LC-115 SA GA08
FF00051 LC-106 & LC-115 SA GA09
UC101 LC-106 & LC-115 SA GA12
FF00004 LC-115 SA GA07
FF00064 LC-115 SA GA10
FF00072 LC-115 SA GA11
UC119 LC-115 SA GA13
122010 LC-115 SA GA14
185329 LC-115 SA GA15
C0059E1 LC-106 & LC-115 US GA03
C0022A1 LC-106 & LC-115 US GA04
C0175A1 LC-106 & LC-115 US GA06
C0006A1 LC-115 US GA01
C0028A1 LC-115 US GA02
C0112A1 LC-115 US GA05

Create a SummarizedExperiment object

NOTE : not possible for the moment because the number of columns does not correspond between the different assays. counts includes an extra column “multiGenera”. For now, we remove the “MuliGenra” from the counts data frame.

Code
mg_to_SE <- function(counts, counts_corr, relabs, LBP_strain_info, technical_metadata) {
  
  technical_metadata <- 
    technical_metadata |> 
    filter(is.na(LibrarySequencedTwice)) 
  
  # suppress "multiGenera" from counts
  counts <- 
    counts |> 
    select(-c(MultiGenera))

  assay_counts <- 
    counts |> 
    dplyr::select(-c(CST, subCST, score)) |> 
    as.data.frame() |> 
    column_to_rownames("sampleID") |> 
    drop_na() |> 
    t() 
  
  assay_counts_corr <- 
    counts_corr |> 
    dplyr::select(-c(CST, subCST, score)) |>
    as.data.frame() |>
    column_to_rownames("sampleID") |>
    t()
  
  assay_relative_ab <- 
    relabs |> 
    dplyr::select(-c(CST, subCST, score)) |>
    as.data.frame() |> 
    column_to_rownames("sampleID") |>
    t()
    
  
  # include a total reads per sample in the colData 
  se_coldata <-
    counts |> 
    rowwise() |> 
    mutate(total_non_human_reads = sum(c_across(-c(sampleID, CST, subCST, score)))) |>
    select(sampleID, total_non_human_reads) |> 
    left_join(
      technical_metadata |> filter(is.na(LibrarySequencedTwice)), 
      by = c("sampleID" = "UID")
    ) |>
    as.data.frame()
  
  se_rowdata <- 
    counts |> 
    dplyr::select(-c(sampleID, CST, subCST, score)) |>
    colnames() |> 
    as.data.frame() |> 
    setNames("strain") |> 
    distinct() |> 
    left_join(
      LBP_strain_info,
      by = join_by(strain == strain_id)
    ) 
  
  # Harmonization of the order of samples and feature
  
  sorted_sample_ids <- sort(as.character(se_coldata$sampleID))
  assay_counts <- assay_counts[, sorted_sample_ids]
  assay_counts_corr <- assay_counts_corr[, sorted_sample_ids]
  assay_relative_ab <- assay_relative_ab[, sorted_sample_ids]
  se_coldata <- se_coldata[match(sorted_sample_ids, se_coldata$sampleID), ]

  sorted_strains <- assay_relative_ab |> rownames() # sort(as.character(se_rowdata$strain))
  assay_counts <- assay_counts[sorted_strains, ]
  assay_counts_corr <- assay_counts_corr[sorted_strains, ]
  assay_relative_ab <- assay_relative_ab[sorted_strains, ]
  se_rowdata <- se_rowdata[match(sorted_strains, se_rowdata$strain), ]

  SummarizedExperiment::SummarizedExperiment(
    assays = list(counts = assay_counts, counts_corr = assay_counts_corr, relabs = assay_relative_ab),
    rowData = se_rowdata,
    colData = se_coldata
  )
}
Code
SE_mg <- mg_to_SE(counts, counts_corr, relabs, LBP_strain_info, technical_metadata)

(include a total count per sample in the colData total_non_human_reads) > OK

assay_name = counts

Exploratory & QC analyses

Code
SE_mg |> as.tibble()
# A tibble: 751,548 × 29
   .feature .sample   counts counts_corr relabs sampleID total_non_human_reads
   <chr>    <chr>      <dbl>       <dbl>  <dbl> <chr>                    <dbl>
 1 122010   MG_1010       0           0  0      MG_1010               9671181.
 2 185329   MG_1010       0           0  0      MG_1010               9671181.
 3 C0006A1  MG_1010       0           0  0      MG_1010               9671181.
 4 C0022A1  MG_1010 1408234.       1494. 0.137  MG_1010               9671181.
 5 C0028A1  MG_1010       0           0  0      MG_1010               9671181.
 6 C0059E1  MG_1010  268235.        285. 0.0261 MG_1010               9671181.
 7 C0112A1  MG_1010       0           0  0      MG_1010               9671181.
 8 C0175A1  MG_1010 6571758.       6971. 0.639  MG_1010               9671181.
 9 FF00004  MG_1010       0           0  0      MG_1010               9671181.
10 FF00018  MG_1010       0           0  0      MG_1010               9671181.
# ℹ 751,538 more rows
# ℹ 22 more variables: SampleType <chr>, Ext_Lib_Plate <dbl>,
#   Ext_Lib_Plate_ID <dbl>, Ext_Lib_Position <chr>, SequencingRun <chr>,
#   DateSequenced <dbl>, Lane <dbl>, Sample <chr>, `Library Pool` <chr>,
#   Library <chr>, FragmentSize <dbl>, IndexSet <chr>, `Index 1` <chr>,
#   `Index 2` <chr>, BioinformaticsProcessingBatch <dbl>,
#   `Selected4re-extraction` <dbl>, LibrarySequencedTwice <dbl>, Notes <chr>, …

###Total number of counts/relative abundances per sample

Total number of counts and corrected counts per sample

Code
SE_mg |> 
  as.tibble() |> 
  group_by(.sample) |> 
  summarise(total_count = sum(counts)) |>
  ggplot(aes(x = total_count)) +
  geom_histogram() +
  scale_x_log10() +
  labs(x = "Total number of counts per sample", 
       y = "Number of samples")

Code
SE_mg |> 
  as.tibble() |> 
  group_by(.sample) |> 
  summarise(total_count = sum(counts_corr)) |>
  ggplot(aes(x = total_count)) +
  geom_histogram() +
  scale_x_log10() +
  labs(x = "Total number of corected counts per sample", 
       y = "Number of samples")

A sample has a very small total number of counts

Code
SE_mg |> 
  group_by(.sample) |>
  as.tibble() |> 
  filter(total_non_human_reads < 1e5) |>
  distinct(.sample) |> 
  print()
# A tibble: 1 × 1
  .sample      
  <chr>        
1 MG_EQ05822476

Total proportion per sample

Code
SE_mg |> 
  as.tibble() |> 
  group_by(.sample) |>
  summarise(total_relab = sum(relabs)) |> 
  ggplot(aes(x = total_relab)) +
  geom_histogram() + 
  labs(x = "Total proportion per sample", 
       y = "Number of samples")

Missing values

Code
SE_mg |> 
  as.tibble() |> 
  ggplot(aes(x=.feature, y = .sample, fill = is.na(counts))) + 
  geom_tile() + 
  scale_x_discrete("Strains") +
  scale_y_discrete("Species",breaks = NULL) +
  labs(title = "Missing values in readcounts") +
  scale_fill_manual(
    name = "Statut",
    values = c("TRUE" = "pink", "FALSE" = "steelblue2"),  
    labels = c("FALSE" = "Not missing", "TRUE" = "Missing")) + 
  theme(axis.text.x = element_blank(),  
        axis.ticks.x = element_blank(), 
        plot.title = element_text(hjust = 0.5))

Code
SE_mg |> 
  as.tibble() |> 
  ggplot(aes(x=.feature, y = .sample, fill = is.na(counts_corr))) + 
  geom_tile() + 
  scale_x_discrete("Species") +
  scale_y_discrete("Samples",breaks = NULL) +
  labs(title = "Missing values in corrected readcounts") +
  scale_fill_manual(
    name = "Statut",
    values = c("TRUE" = "pink", "FALSE" = "steelblue2"),  
    labels = c("FALSE" = "Not missing", "TRUE" = "Missing")) + 
  theme(axis.text.x = element_blank(),  
        axis.ticks.x = element_blank(), 
        plot.title = element_text(hjust = 0.5))

Code
SE_mg |> 
  as.tibble() |> 
  ggplot(aes(x=.feature, y = .sample, fill = is.na(relabs))) + 
  geom_tile() + 
  scale_x_discrete("Species") +
  scale_y_discrete("Samples",breaks = NULL) +
  labs(title = "Missing values in relative abundance") +
  scale_fill_manual(
    name = "Statut",
    values = c("TRUE" = "pink", "FALSE" = "steelblue2"),  
    labels = c("FALSE" = "Not missing", "TRUE" = "Missing")) + 
  theme(axis.text.x = element_blank(),  
        axis.ticks.x = element_blank(), 
        plot.title = element_text(hjust = 0.5))

Proportion of LBP strains per sample

Code
SE_mg |> 
  as.tibble() |>  
  filter(!is.na(Biose_ID)) |> 
  ggplot(aes(x = .feature, y = .sample, fill = relabs)) +
  geom_tile() +
  scale_fill_continuous(low = "white", high = "steelblue2") +
  scale_x_discrete("Strains") +
  scale_y_discrete("Samples",breaks = NULL) +
  facet_grid(. ~ LBP + strain_origin, scales = "free_x", space = "free") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Co-occurrence analysis

Is there a strain that is only present when the others are not?

TODO : improve plot (same order of taxa, why there is an X in front of LBP names in y ??)

Code
presence_absence <- 
  SE_mg |> 
  as.tibble() |>  
  filter(!is.na(Biose_ID)) |> 
  select(.feature, .sample, relabs) |> 
  pivot_wider(names_from = .feature, values_from = relabs, values_fill = 0) |>
  mutate(across(where(is.numeric), ~ ifelse(. > 0, 1, 0))) |> 
  select(-.sample)

cooccurrence <- presence_absence |>
  summarise(across(everything(), sum)) |>
  pivot_longer(cols = everything(), names_to = "taxon", values_to = "presence_count")

cooccurrence_matrix <- presence_absence |>
  summarise(across(everything(), list)) |>
  pivot_longer(cols = everything(), names_to = "taxon", values_to = "presence_list") |>
  mutate(presence_list = map(presence_list, unlist)) |>
  pull(presence_list)

cooccur_mat = matrix(0, nrow = length(cooccurrence_matrix), ncol = length(cooccurrence_matrix))
rownames(cooccur_mat) = colnames(presence_absence)
colnames(cooccur_mat) = colnames(presence_absence)

for (i in 1:length(cooccurrence_matrix)){
  for (j in 1:length(cooccurrence_matrix)){
    cooccur_mat[i,j] = sum(cooccurrence_matrix[[i]] & cooccurrence_matrix[[j]])
  }
}

exclusive_strains <- data.frame(cooccur_mat) |>
  mutate(strain = rownames(cooccur_mat)) |>
  pivot_longer(!strain, names_to = "other_strain", values_to = "cooccur") |>
  group_by(strain) |>
  summarise(total_cooccur = sum(cooccur)) |>
  filter(total_cooccur == 0)

print(exclusive_strains)
# A tibble: 0 × 2
# ℹ 2 variables: strain <chr>, total_cooccur <dbl>
Code
cooccur_melted = data.frame(cooccur_mat) |>
  mutate(strain = rownames(cooccur_mat)) |>
  pivot_longer(!strain, names_to = "other_strain", values_to = "cooccur")

ggplot(cooccur_melted, aes(x = strain, y = other_strain, fill = cooccur)) +
  geom_tile() +
  scale_fill_continuous(low = "white", high = "steelblue2") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Control sample

LBP abundance in control samples

Code
SE_mg |> 
  as.tibble() |>  
  filter(!is.na(Biose_ID))|>
  filter(SampleType != "ClinicalSample") |> 
  ggplot(aes(x = .feature, y = .sample, fill = relabs)) +
  geom_tile() +
  scale_fill_continuous(low = "white", high = "steelblue2") +
  scale_x_discrete("Strains") +
  scale_y_discrete("Samples") +
  facet_grid(SampleType ~ LBP + strain_origin, scales = "free", space = "free") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Code
top20_species_all <-
  SE_mg |>
  as.tibble() |>
  group_by(.feature) |>
  summarise(total_relabs = sum(relabs)) |>
  arrange(desc(total_relabs)) |>
  slice_head(n = 20) |>
  group_by(color = case_when(
    .feature %in% LBP_strain_info$strain_id ~ colorRampPalette(c("lightpink", "hotpink3"))(n()),
    str_detect(.feature, "Lactobacillus") ~ colorRampPalette(c("chocolate1","chocolate4"))(n()),
    str_detect(.feature, "Gardnerella") ~ colorRampPalette(c("darkolivegreen1", "darkolivegreen4"))(n()),
    str_detect(.feature, "Prevotella") ~ colorRampPalette(c("lightblue", "deepskyblue3"))(n()),
    TRUE ~ colorRampPalette(c("antiquewhite", "antiquewhite3"))(n())
  )) 

top20_species_control <-
  SE_mg |>
  as.tibble() |>
  filter(SampleType != "ClinicalSample") |>
  group_by(.feature) |>
  summarise(total_relabs = sum(relabs)) |>
  arrange(desc(total_relabs)) |>
  slice_head(n = 20) |>
  group_by(color = case_when(
    .feature %in% LBP_strain_info$strain_id ~ colorRampPalette(c("lightpink", "hotpink3"))(n()),
    str_detect(.feature, "Lactobacillus") ~ colorRampPalette(c("chocolate1","chocolate4"))(n()),
    str_detect(.feature, "Gardnerella") ~ colorRampPalette(c("darkolivegreen1", "darkolivegreen4"))(n()),
    str_detect(.feature, "Prevotella") ~ colorRampPalette(c("lightblue", "deepskyblue3"))(n()),
    TRUE ~ colorRampPalette(c("antiquewhite", "antiquewhite3"))(n())
  )) 


SE_mg |> 
  as.tibble() |>
  filter(.feature %in% all_of(top20_species_all$.feature)) |>
  filter(SampleType != "ClinicalSample") |>
  ggplot(aes(x = .feature, y = .sample, fill = relabs)) +
  geom_tile() +
  scale_fill_continuous(low = "white", high = "steelblue2") +
  scale_x_discrete("Species") +
  scale_y_discrete("Samples") +
  facet_grid(SampleType ~ ., scales = "free", space = "free") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Code
SE_mg |>
  as.tibble() |>
  filter(.feature %in% all_of(top20_species_all$.feature)) |>
  left_join(
    top20_species_all |> select(.feature, color),
    by = ".feature"
  ) |>
  filter(SampleType != "ClinicalSample") |>
  mutate(.feature = factor(.feature, levels = sort(unique(.feature)))) |> 
  ggplot(aes(x = .sample, y = relabs, fill = .feature)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ SampleType, scales = "free_x") +
  labs(
    x = "Samples",
    y = "Relative Abundance",
    title = "Controle sample : Top 20 species of all samples",
    fill = "Species"
  ) +
  scale_x_discrete("Samples", breaks = NULL) +
  scale_fill_manual(
    values = setNames(top20_species_all$color, top20_species_all$.feature),
    breaks = sort(unique(top20_species_all$.feature)), 
    labels = sort(unique(top20_species_all$.feature))  
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1), 
        plot.title = element_text(hjust = 0.5))

Code
SE_mg |>
  as.tibble() |>
  filter(.feature %in% all_of(top20_species_control$.feature)) |>
  left_join(
    top20_species_control |> select(.feature, color),
    by = ".feature"
  ) |>
  filter(SampleType != "ClinicalSample") |>
  mutate(.feature = factor(.feature, levels = sort(unique(.feature)))) |> 
  ggplot(aes(x = .sample, y = relabs, fill = .feature)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ SampleType, scales = "free_x") +
  labs(
    x = "Samples",
    y = "Relative Abundance",
    title = "Controle sample : Top 20 species of control samples",
    fill = "Species"
  ) +
  scale_x_discrete("Samples", breaks = NULL) +
  scale_fill_manual(
    values = setNames(top20_species_control$color, top20_species_control$.feature),
    breaks = sort(unique(top20_species_control$.feature)), 
    labels = sort(unique(top20_species_control$.feature))  
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1), 
      plot.title = element_text(hjust = 0.5))

Technical metadata

Code
data_plot <-
  technical_metadata |>
  select(-c(UID, Sample, SampleType, Library, Notes, FragmentSize, IndexSet, `Index 1`, `Index 2`, Ext_Lib_Position)) |>
  mutate(across(everything(), as.factor)) |>
  rename(LibraryPool = `Library Pool`,
         Selected4re_extraction = `Selected4re-extraction`)

generate_plot <- function(df, x_var, fill_var) {
  n_levels_fill <- nlevels(df[[fill_var]])
  if (n_levels_fill <= 12) {
    ggplot(df, aes_string(x = x_var, fill = fill_var)) +
      geom_bar(position = "stack") +
      theme_bw() +
      scale_fill_manual(values = brewer.pal(n = n_levels_fill, name = "Set3")) +
      labs(x = x_var, fill = fill_var) +
      theme(axis.text.x = element_text(angle = 90, hjust = 1))
  } else {
    message(cat("Trop de niveaux : x = ",x_var, "fill =", fill_var))
    invisible(NULL) # Retourne NULL pour que walk() ne tente pas d'imprimer un message
  }
}

factor_vars <- names(data_plot)

combinations <- cross(list(x = factor_vars, fill = factor_vars)) %>%
  keep(~ which(.x$x == factor_vars) < which(.x$fill == factor_vars)) 

# Appliquer la fonction de génération de graphique à chaque combinaison
walk(combinations, ~ print(generate_plot(data_plot, .x$x, .x$fill)))

Trop de niveaux : x =  Ext_Lib_Plate fill = LibraryPoolNULL
Trop de niveaux : x =  Ext_Lib_Plate_ID fill = LibraryPoolNULL
Trop de niveaux : x =  SequencingRun fill = LibraryPoolNULL
Trop de niveaux : x =  DateSequenced fill = LibraryPoolNULL
Trop de niveaux : x =  Lane fill = LibraryPoolNULL

PCoA on counts

Code
dist_matrix <-
  vegdist(
    assay(SE_mg, "counts") |> t(), 
    method = "bray")

pcoa <- wcmdscale(dist_matrix, eig = TRUE)
print(pcoa)
Call: wcmdscale(d = dist_matrix, eig = TRUE)

          Inertia Rank
Total      359.39     
Real       428.63  344
Imaginary  -69.24  621

Results have 966 points, 344 axes

Eigenvalues:
  [1] 60.97 50.39 33.62 21.71 15.98 13.65 12.50 10.89  9.51  8.84  8.08  6.79
 [13]  6.25  6.08  5.51  4.77  4.59  4.39  4.38  4.21  4.07  3.73  3.61  3.32
 [25]  3.15  3.07  2.94  2.81  2.69  2.58  2.46  2.45  2.32  2.27  2.20  2.10
 [37]  2.04  1.96  1.87  1.80  1.71  1.62  1.56  1.54  1.52  1.44  1.40  1.38
 [49]  1.32  1.29  1.24  1.21  1.18  1.17  1.12  1.10  1.08  1.07  1.05  1.03
 [61]  1.00  0.97  0.95  0.93  0.89  0.88  0.87  0.85  0.82  0.80  0.79  0.77
 [73]  0.75  0.74  0.72  0.72  0.71  0.68  0.67  0.66  0.64  0.63  0.62  0.61
 [85]  0.60  0.59  0.58  0.57  0.55  0.54  0.53  0.52  0.51  0.51  0.50  0.49
 [97]  0.48  0.48  0.47  0.46  0.45  0.45  0.44  0.43  0.42  0.42  0.41  0.41
[109]  0.40  0.40  0.39  0.39  0.38  0.38  0.37  0.36  0.36  0.35  0.34  0.34
(Showing 120 of 965 eigenvalues)

Weights: Constant
Code
pcoa$eig |>
  as.data.frame() |>
  rownames_to_column("axis") |>
  mutate(axis = str_remove(axis, "Eigenvalue"), 
         var_explained = 100*pcoa$eig/sum(pcoa$eig)) |>
  slice(1:10) |>
  gt() |>
  tab_header(title = "Eigenvalues of the PCoA") |>
  cols_label(axis = "Axis", `pcoa$eig` = "Eigenvalue", var_explained = "Variance explained") |>
  fmt_number(columns = vars(`pcoa$eig`, var_explained), decimals = 3)
Eigenvalues of the PCoA
Axis Eigenvalue Variance explained
1 60.975 16.966
2 50.394 14.022
3 33.616 9.353
4 21.708 6.040
5 15.980 4.446
6 13.648 3.798
7 12.496 3.477
8 10.893 3.031
9 9.509 2.646
10 8.836 2.459
Code
pcoa$eig |> 
  as_data_frame() |>
  rownames_to_column("axis") |>
  mutate(axis = axis |> as.numeric()) |>
  ggplot(aes(x = axis, y = value)) +
  geom_col() +
  labs(
    x= "Axis",
    y = "Eigenvalue"
  ) + 
  theme_bw()

Code
pcoa_data <- 
  as.data.frame(pcoa$points) |> 
  mutate(sampleID = counts$sampleID) |> 
  left_join(
    technical_metadata |> filter(is.na(LibrarySequencedTwice)), 
    by = c("sampleID" = "UID")
  )
Code
pcoa_plot <- function(pcoa, color_var, axes = 1:2){
  
  pcoa_data <- pcoa_data |> 
    mutate(!!sym(color_var) := as.factor(!!sym(color_var)))

  pcoa_data |>
    ggplot(aes(x = !!sym(paste0("Dim", axes[1])), y = !!sym(paste0("Dim", axes[2])), color = !!sym(color_var))) +
    geom_point(size = 2, alpha = 0.7) +
    coord_fixed() +
    labs(
      x = paste0("PCoA ", axes[1], " (", round(100 * pcoa$eig[axes[1]] / sum(pcoa$eig), 1), "%)"),
      y = paste0("PCoA ", axes[2], " (", round(100 * pcoa$eig[axes[2]] / sum(pcoa$eig), 1), "%)"),
      color = color_var
    ) + 
    theme_bw()
}

Type of sample

Code
pcoa_plot(pcoa, "SampleType", c(1, 2))

Code
pcoa_plot(pcoa, "SampleType", c(3, 4))

Code
pcoa_plot(pcoa, "SampleType", c(5, 6))

Code
manova <- adonis2(
  dist_matrix ~ SampleType,
  data = pcoa_data
)

manova
Permutation test for adonis under reduced model
Terms added sequentially (first to last)
Permutation: free
Number of permutations: 999

adonis2(formula = dist_matrix ~ SampleType, data = pcoa_data)
            Df SumOfSqs      R2     F Pr(>F)    
SampleType   4     4.42 0.01229 2.989  0.001 ***
Residual   961   354.97 0.98771                 
Total      965   359.39 1.00000                 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Plate

Code
pcoa_plot(pcoa, "Ext_Lib_Plate", c(1, 2))

Code
pcoa_plot(pcoa, "Ext_Lib_Plate", c(3, 4))

Code
pcoa_plot(pcoa, "Ext_Lib_Plate", c(5, 6))

Permutation test

Code
manova <- adonis2(
  dist_matrix ~ Ext_Lib_Plate,
  data = pcoa_data
)

manova
Permutation test for adonis under reduced model
Terms added sequentially (first to last)
Permutation: free
Number of permutations: 999

adonis2(formula = dist_matrix ~ Ext_Lib_Plate, data = pcoa_data)
               Df SumOfSqs      R2      F Pr(>F)    
Ext_Lib_Plate   1     4.57 0.01272 12.423  0.001 ***
Residual      964   354.82 0.98728                  
Total         965   359.39 1.00000                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Batch

Code
pcoa_plot(pcoa, "BioinformaticsProcessingBatch", c(1, 2))

Code
pcoa_plot(pcoa, "BioinformaticsProcessingBatch", c(3, 4))

Code
pcoa_plot(pcoa, "BioinformaticsProcessingBatch", c(5, 6))

Code
manova <- adonis2(
  dist_matrix ~ BioinformaticsProcessingBatch,
  data = pcoa_data
)

manova
Permutation test for adonis under reduced model
Terms added sequentially (first to last)
Permutation: free
Number of permutations: 999

adonis2(formula = dist_matrix ~ BioinformaticsProcessingBatch, data = pcoa_data)
                               Df SumOfSqs      R2      F Pr(>F)    
BioinformaticsProcessingBatch   1     3.21 0.00894 8.6962  0.001 ***
Residual                      964   356.18 0.99106                  
Total                         965   359.39 1.00000                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Library Pool

Code
pcoa_plot(pcoa, "Library Pool", c(1, 2))

Code
pcoa_plot(pcoa, "Library Pool", c(3, 4))

Code
pcoa_plot(pcoa, "Library Pool", c(5, 6))

Code
manova <- adonis2(
  dist_matrix ~ `Library Pool`,
  data = pcoa_data
)

manova
Permutation test for adonis under reduced model
Terms added sequentially (first to last)
Permutation: free
Number of permutations: 999

adonis2(formula = dist_matrix ~ `Library Pool`, data = pcoa_data)
                Df SumOfSqs      R2      F Pr(>F)    
`Library Pool`  17    16.13 0.04487 2.6199  0.001 ***
Residual       948   343.26 0.95513                  
Total          965   359.39 1.00000                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Lane

Code
pcoa_plot(pcoa, "Lane", c(1, 2))

Code
pcoa_plot(pcoa, "Lane", c(3, 4))

Code
pcoa_plot(pcoa, "Lane", c(5, 6))

Code
manova <- adonis2(
  dist_matrix ~ Lane,
  data = pcoa_data
)

manova
Permutation test for adonis under reduced model
Terms added sequentially (first to last)
Permutation: free
Number of permutations: 999

adonis2(formula = dist_matrix ~ Lane, data = pcoa_data)
          Df SumOfSqs      R2      F Pr(>F)  
Lane       1     0.66 0.00185 1.7847  0.037 *
Residual 964   358.73 0.99815                
Total    965   359.39 1.00000                
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

PCoA on relative abundances

Code
dist_matrix_relab <-
  vegdist(
    relabs |> dplyr::select(-c(CST, subCST, score)) |> column_to_rownames("sampleID"), 
    method = "bray")

pcoa_relab <- wcmdscale(dist_matrix_relab, eig = TRUE)
print(pcoa_relab)
Call: wcmdscale(d = dist_matrix_relab, eig = TRUE)

          Inertia Rank
Total      309.98     
Real       381.84  326
Imaginary  -71.86  639

Results have 966 points, 326 axes

Eigenvalues:
  [1] 71.59 59.11 39.07 19.66 15.30 12.04 10.41  8.62  7.37  6.53  5.77  5.39
 [13]  5.24  4.78  4.31  4.24  3.83  3.75  3.24  3.15  2.98  2.79  2.68  2.57
 [25]  2.24  2.16  2.08  1.94  1.77  1.73  1.59  1.54  1.53  1.41  1.36  1.35
 [37]  1.28  1.25  1.18  1.16  1.12  1.09  1.05  1.03  0.98  0.96  0.93  0.91
 [49]  0.88  0.87  0.82  0.80  0.79  0.78  0.76  0.75  0.74  0.70  0.69  0.67
 [61]  0.66  0.64  0.63  0.62  0.59  0.56  0.56  0.55  0.54  0.52  0.51  0.49
 [73]  0.49  0.49  0.48  0.47  0.46  0.45  0.44  0.43  0.42  0.41  0.41  0.40
 [85]  0.39  0.38  0.38  0.37  0.36  0.36  0.35  0.35  0.34  0.33  0.33  0.32
 [97]  0.31  0.30  0.30  0.29  0.29  0.28  0.28  0.27  0.26  0.26  0.26  0.26
[109]  0.25  0.24  0.24  0.24  0.24  0.23  0.23  0.23  0.22  0.22  0.21  0.21
(Showing 120 of 965 eigenvalues)

Weights: Constant
Code
pcoa_relab$eig |>
  as.data.frame() |>
  rownames_to_column("axis") |>
  mutate(axis = str_remove(axis, "Eigenvalue"), 
         var_explained = 100*pcoa$eig/sum(pcoa_relab$eig)) |>
  slice(1:10) |>
  gt() |>
  tab_header(title = "Eigenvalues of the PCoA") |>
  cols_label(axis = "Axis", `pcoa_relab$eig` = "Eigenvalue", var_explained = "Variance explained") |>
  fmt_number(columns = vars(`pcoa_relab$eig`, var_explained), decimals = 3)
Eigenvalues of the PCoA
Axis Eigenvalue Variance explained
1 71.593 19.670
2 59.108 16.257
3 39.074 10.844
4 19.664 7.003
5 15.305 5.155
6 12.043 4.403
7 10.414 4.031
8 8.623 3.514
9 7.367 3.068
10 6.533 2.850
Code
pcoa_relab$eig |> 
  as_data_frame() |>
  rownames_to_column("axis") |>
  mutate(axis = axis |> as.numeric()) |>
  ggplot(aes(x = axis, y = value)) +
  geom_col() +
  labs(
    x= "Axis",
    y = "Eigenvalue"
  ) + 
  theme_bw()

Code
pcoa_relab_data <- 
  as.data.frame(pcoa_relab$points) |> 
  mutate(sampleID = counts$sampleID) |> 
  left_join(
    technical_metadata |> filter(is.na(LibrarySequencedTwice)), 
    by = c("sampleID" = "UID")
  )

Type of sample

Code
pcoa_plot(pcoa_relab, "SampleType", c(1, 2))

Code
pcoa_plot(pcoa_relab, "SampleType", c(1, 2))

Code
pcoa_plot(pcoa_relab, "SampleType", c(1, 2))

Plate

Code
pcoa_plot(pcoa_relab, "Ext_Lib_Plate", c(1, 2))

Code
pcoa_plot(pcoa_relab, "Ext_Lib_Plate", c(3, 4))

Code
pcoa_plot(pcoa_relab, "Ext_Lib_Plate", c(5, 6))

Code
manova <- adonis2(
  dist_matrix_relab ~ Ext_Lib_Plate,
  data = pcoa_relab_data
)

manova
Permutation test for adonis under reduced model
Terms added sequentially (first to last)
Permutation: free
Number of permutations: 999

adonis2(formula = dist_matrix_relab ~ Ext_Lib_Plate, data = pcoa_relab_data)
               Df SumOfSqs      R2      F Pr(>F)    
Ext_Lib_Plate   1    4.345 0.01402 13.704  0.001 ***
Residual      964  305.639 0.98598                  
Total         965  309.984 1.00000                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Batch

Code
pcoa_plot(pcoa_relab, "BioinformaticsProcessingBatch", c(1, 2))

Code
pcoa_plot(pcoa_relab, "BioinformaticsProcessingBatch", c(3, 4))

Code
pcoa_plot(pcoa_relab, "BioinformaticsProcessingBatch", c(5, 6))

Code
manova <- adonis2(
  dist_matrix_relab ~ BioinformaticsProcessingBatch,
  data = pcoa_relab_data
)

manova
Permutation test for adonis under reduced model
Terms added sequentially (first to last)
Permutation: free
Number of permutations: 999

adonis2(formula = dist_matrix_relab ~ BioinformaticsProcessingBatch, data = pcoa_relab_data)
                               Df SumOfSqs      R2      F Pr(>F)    
BioinformaticsProcessingBatch   1    3.212 0.01036 10.093  0.001 ***
Residual                      964  306.772 0.98964                  
Total                         965  309.984 1.00000                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Library Pool

Code
pcoa_plot(pcoa_relab, "Library Pool", c(1, 2))

Code
pcoa_plot(pcoa_relab, "Library Pool", c(3, 4))

Code
pcoa_plot(pcoa_relab, "Library Pool", c(5, 6))

Code
manova <- adonis2(
  dist_matrix_relab ~ `Library Pool`,
  data = pcoa_relab_data
)

manova
Permutation test for adonis under reduced model
Terms added sequentially (first to last)
Permutation: free
Number of permutations: 999

adonis2(formula = dist_matrix_relab ~ `Library Pool`, data = pcoa_relab_data)
                Df SumOfSqs      R2      F Pr(>F)    
`Library Pool`  17   15.021 0.04846 2.8398  0.001 ***
Residual       948  294.963 0.95154                  
Total          965  309.984 1.00000                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Lane

Code
pcoa_plot(pcoa_relab, "Lane", c(1, 2))

Code
pcoa_plot(pcoa_relab, "Lane", c(3, 4))

Code
pcoa_plot(pcoa_relab, "Lane", c(5, 6))

Code
manova <- adonis2(
  dist_matrix_relab ~ Lane,
  data = pcoa_relab_data
)

manova
Permutation test for adonis under reduced model
Terms added sequentially (first to last)
Permutation: free
Number of permutations: 999

adonis2(formula = dist_matrix_relab ~ Lane, data = pcoa_relab_data)
          Df SumOfSqs      R2      F Pr(>F)  
Lane       1    0.615 0.00198 1.9169  0.054 .
Residual 964  309.369 0.99802                
Total    965  309.984 1.00000                
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Exclude failed samples)

Build a taxonomic table

Add relative abundances

assay_name = rel_ab

Additional SE with bacteria only counts and relative abundances

Take the counts assay from SE1; subset the feature that are bacteria; compute total_bacterial_reads and rel_ab

Save SummarizedExperiment objects

Save the SE objects to disk